Speaker: Ruth Chirinos
#install.packages("dplyr")
library(tidyverse)
library(nycflights13)
flights
#
?flights
More ways to learn basic info on a data.frame.
names(flights)
[1] "year" "month" "day" "dep_time" "sched_dep_time" "dep_delay"
[7] "arr_time" "sched_arr_time" "arr_delay" "carrier" "flight" "tailnum"
[13] "origin" "dest" "air_time" "distance" "hour" "minute"
[19] "time_hour"
dim(flights) # ?dim dimension
[1] 336776 19
ncol(flights) # ?ncol number of columns
[1] 19
nrow(flights) # ?nrow number of rows
[1] 336776
summary(flights)
year month day dep_time sched_dep_time dep_delay arr_time
Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106 Min. : -43.00 Min. : 1
1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104
Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359 Median : -2.00 Median :1535
Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344 Mean : 12.64 Mean :1502
3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940
Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359 Max. :1301.00 Max. :2400
NA's :8255 NA's :8255 NA's :8713
sched_arr_time arr_delay carrier flight tailnum origin
Min. : 1 Min. : -86.000 Length:336776 Min. : 1 Length:336776 Length:336776
1st Qu.:1124 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character Class :character
Median :1556 Median : -5.000 Mode :character Median :1496 Mode :character Mode :character
Mean :1536 Mean : 6.895 Mean :1972
3rd Qu.:1945 3rd Qu.: 14.000 3rd Qu.:3465
Max. :2359 Max. :1272.000 Max. :8500
NA's :9430
dest air_time distance hour minute time_hour
Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
Mode :character Median :129.0 Median : 872 Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
Mean :150.7 Mean :1040 Mean :13.18 Mean :26.23 Mean :2013-07-03 05:22:54
3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
Max. :695.0 Max. :4983 Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
NA's :9430
#install.packages('skimr')
library(skimr)
skim(flights)
── Data Summary ────────────────────────
Values
Name flights
Number of rows 336776
Number of columns 19
_______________________
Column type frequency:
character 4
numeric 14
POSIXct 1
________________________
Group variables None
── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 carrier 0 1 2 2 0 16 0
2 tailnum 2512 0.993 5 6 0 4043 0
3 origin 0 1 3 3 0 3 0
4 dest 0 1 3 3 0 105 0
── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 year 0 1 2013 0 2013 2013 2013 2013 2013 ▁▁▇▁▁
2 month 0 1 6.55 3.41 1 4 7 10 12 ▇▆▆▆▇
3 day 0 1 15.7 8.77 1 8 16 23 31 ▇▇▇▇▆
4 dep_time 8255 0.975 1349. 488. 1 907 1401 1744 2400 ▁▇▆▇▃
5 sched_dep_time 0 1 1344. 467. 106 906 1359 1729 2359 ▁▇▇▇▃
6 dep_delay 8255 0.975 12.6 40.2 -43 -5 -2 11 1301 ▇▁▁▁▁
7 arr_time 8713 0.974 1502. 533. 1 1104 1535 1940 2400 ▁▃▇▇▇
8 sched_arr_time 0 1 1536. 497. 1 1124 1556 1945 2359 ▁▃▇▇▇
9 arr_delay 9430 0.972 6.90 44.6 -86 -17 -5 14 1272 ▇▁▁▁▁
10 flight 0 1 1972. 1632. 1 553 1496 3465 8500 ▇▃▃▁▁
11 air_time 9430 0.972 151. 93.7 20 82 129 192 695 ▇▂▂▁▁
12 distance 0 1 1040. 733. 17 502 872 1389 4983 ▇▃▂▁▁
13 hour 0 1 13.2 4.66 1 9 13 17 23 ▁▇▇▇▅
14 minute 0 1 26.2 19.3 0 8 29 44 59 ▇▃▆▃▅
── Variable type: POSIXct ────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max median n_unique
1 time_hour 0 1 2013-01-01 05:00:00 2013-12-31 23:00:00 2013-07-03 10:00:00 6936
Ver el tibble (data frame)
#View(flights)
glimpse(flights)
Observations: 336,776
Variables: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, 558, 559, 559, 559, 600, 60…
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, 600, 600, 559, 600, 600, 60…
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0, 0, 1, -8, -3, -4, -4, 0, …
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924, 923, 941, 702, 854, 851, 8…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917, 937, 910, 706, 902, 858, 8…
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4, -8, -7, 12, -6, -8, 16, -1…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6", "B6", "UA", "UA", "AA", "B…
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194, 1124, 707, 1806, 1187, 37…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516JB", "N829AS", "N593JB", "N…
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "LGA", "JFK", "JFK", "JFK", "…
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "ORD", "PBI", "TPA", "LAX", "…
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 361, 257, 44, 337, 152, 134,…
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005, 2475, 2565, 1389, 187, 22…
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0, 10, 5, 10, 10, 7, 0, 0, 10…
$ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01…
filter(flights, month == 1, day == 1)
dplyr functions never modify their inputs, so if you want to save the result, you’ll need to use the assignment operator, <-
( jan1 <- filter(flights, month == 1, day == 1) )
NA
# filter(flights, month = 1)
sqrt(2) ^ 2 == 2
[1] FALSE
1 / 49 * 49 == 1
[1] FALSE
#
near(sqrt(2) ^ 2, 2)
[1] TRUE
near(1 / 49 * 49, 1)
[1] TRUE
Boolean operators: & is “and”, | is “or”, and ! is “no”
filter(flights, month == 11 | month == 12)
Example with %in%
( nov_dec <- filter(flights, month %in% c(11, 12)) )
If you wanted to find flights that weren’t delayed (on arrival or departure) by more than two hours, you could use either of the following two filters:
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)
NA > 5
[1] NA
#> [1] NA
10 == NA
[1] NA
#> [1] NA
NA + 10
[1] NA
#> [1] NA
NA / 2
[1] NA
#> [1] NA
NA == NA
#> [1] NA
x <- NA
y <- NA
x == y
#> [1] NA
# We don't know!
is.na(x)
df <- tibble(x = c(1, NA, 3))
df
filter(df, x > 1)
filter(df, is.na(x) | x > 1)
arrange(flights, year, month, day)
arrange(flights, desc(dep_delay))
Missing Values
df <- tibble(x = c(5, 2, NA))
arrange(df, x)
arrange(df, desc(x))
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
rename(flights, tail_num = tailnum)
select(flights, time_hour, air_time, everything())
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate(flights_sml,
gain = dep_delay - arr_delay,
speed = distance / air_time * 60
)
Note that you can refer to columns that you’ve just created:
mutate(flights_sml,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
If you only want to keep the new variables, use transmute():
transmute(flights,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
(by_day <- group_by(flights, year, month, day))
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
(by_day <- group_by(flights, year, month, day))
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
delay <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
#
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = TRUE))
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
#
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
The story is actually a little more nuanced. We can get more insight if we draw a scatterplot of number of flights vs. average delay:
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
With a filter n > 25
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10) # +
#geom_smooth(se = FALSE)
Measures of location: we’ve used mean(x), but median(x) is also useful.
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
Measures of spread: sd(x), IQR(x), mad(x)
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
Measures of rank: min(x), quantile(x, 0.25), max(x)
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
Measures of position: first(x), nth(x, 2), last(x)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
Filtering on Ranks
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
Counts: You’ve seen n(),
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
Counts are so useful that dplyr provides a simple helper if all you want is a count:
not_cancelled %>%
count(dest)
Counts and proportions of logical values: sum(x > 10), mean(y == 0)
# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(n_early = sum(dep_time < 500))
# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(hour_prop = mean(arr_delay > 60))
When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll up a dataset:
daily <- group_by(flights, year, month, day)
(per_day <- summarise(daily, flights = n()))
#
(per_month <- summarise(per_day, flights = sum(flights)))
#
(per_year <- summarise(per_month, flights = sum(flights)))
NA
daily
#
daily %>%
ungroup() %>% # no longer grouped by date
summarise(flights = n()) # all flights
flights_sml %>%
group_by(year, month, day) %>%
filter(rank(desc(arr_delay)) < 10)
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
#
popular_dests
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)